2  PBMCs: Kmeans Clustering Analysis

2.1 Package Loading Set Up

.libPaths(c("/homes8/bohoon/R/x86_64-pc-linux-gnu-library/4.1", "/homes8/bohoon/R/x86_64-pc-linux-gnu-library/4.2", "/homes8/bohoon/R/x86_64-pc-linux-gnu-library/4.3", "/homes8/bohoon/R/x86_64-pc-linux-gnu-library/not_4.3", "/homes8/bohoon/R/x86_64-pc-linux-gnu-library/4.4"))
library(dplyr)
Warning: package 'dplyr' was built under R version 4.4.0

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(purrr)
library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.4.0
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ readr     2.1.5     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

2.2 Loading in Unfiltered Dataframe

PT106_merged_df <- readRDS("./PT106_merged_df.rds")
PT108_merged_df <- readRDS("./PT108_merged_df.rds")
PT111_merged_df <- readRDS("./PT111_merged_df.rds")
PT113_merged_df <- readRDS("./PT113_merged_df.rds")
PT114_merged_df <- readRDS( "./PT114_merged_df.rds")
PT116_merged_df <- readRDS("./PT116_merged_df.rds")
PT117_merged_df <- readRDS("./PT117_merged_df.rds")
PT118_merged_df <- readRDS("./PT118_merged_df.rds")
PT119_merged_df <- readRDS("./PT119_merged_df.rds")
PT122_merged_df <- readRDS("./PT122_merged_df.rds")


patient_dfs_postnivo <- list(
  PT108 = PT108_merged_df,
  PT113 = PT113_merged_df,
  PT114 = PT114_merged_df, 
  PT116 = PT116_merged_df,
  PT117 = PT117_merged_df, 
  PT118 = PT118_merged_df, 
  PT119 = PT119_merged_df,
  PT122 = PT122_merged_df
)



# Find common columns across all dataframes (trying to merge across dataframes)
common_cols_postnivo <- Reduce(intersect, lapply(patient_dfs_postnivo, colnames))
# Subset all dataframes to only those columns
patient_dfs_postnivo <- lapply(patient_dfs_postnivo, function(df) df[,common_cols_postnivo])

# Combine all dataframes into one list, adding patient ID
combined_df_postnivo <- bind_rows(
  lapply(names(patient_dfs_postnivo), function(pid) {
    df <- patient_dfs_postnivo[[pid]]
    df$patient_id <- pid
    return(df)
  })
)

df_matrix_postnivo <- combined_df_postnivo %>%
  mutate(clone_label = paste(clonotype_id, patient_id, sep = "_")) %>%
  dplyr::select(clone_label, Prevaccine_umi_fraction, Postvaccine_umi_fraction, Postnivo_umi_fraction)

umi_matrix_postnivo <- as.matrix(df_matrix_postnivo[, -1])
rownames(umi_matrix_postnivo) <- df_matrix_postnivo$clone_label

2.3 Running Kmeans Analysis

## Running Kmeans Analysis 
set.seed(6)

sds_nivo <- apply(umi_matrix_postnivo, 1, sd)
filtered_postnivo <- umi_matrix_postnivo[sds_nivo > 0,]

scaled_umi_matrix_postnivo <- t(scale(t(filtered_postnivo)))
km_postnivo <- kmeans(scaled_umi_matrix_postnivo, centers = 5, iter.max = 50)
km_postnivo$centers
  Prevaccine_umi_fraction Postvaccine_umi_fraction Postnivo_umi_fraction
1              -1.0937522               0.79305016             0.3007020
2              -1.0265353               0.08903749             0.9374978
3              -0.5603236               1.14732493            -0.5870013
4              -0.5575350              -0.58951352             1.1470486
5               1.1330068              -0.56216007            -0.5708468

2.4 Plotting variances based on number of Ks

# Analogous to Elbow Method for finding the optimal number of clusters
set.seed(4)
# plot k = 2 to k = 15.
k.max <- 15
data <- umi_matrix_postnivo
wss <- sapply(1:k.max, 
              function(k){kmeans(data, k, iter.max = 50)$tot.withinss})
plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

2.5 Plotting the centroids and the SD from the centroids

# Get cluster assignment for each row
cluster_assignments <- km_postnivo$cluster
scaled_mat <- as.data.frame(scaled_umi_matrix_postnivo)
scaled_mat$cluster <- factor(cluster_assignments)
scaled_mat$id <- rownames(scaled_mat)

# Reshape to long format
long_scaled <- scaled_mat %>%
  pivot_longer(cols = c(Prevaccine_umi_fraction, Postvaccine_umi_fraction, Postnivo_umi_fraction),
               names_to = "timepoint", values_to = "value")

centroid_long <- km_postnivo$centers %>%
  as.data.frame() %>%
  mutate(cluster = factor(rownames(.))) %>%
  pivot_longer(
    cols = -cluster,  # pivot all columns except 'cluster'
    names_to = "timepoint",
    values_to = "centroid_value"
  )

# Join data to get centroid per cluster + timepoint
joined <- long_scaled %>%
  left_join(centroid_long, by = c("cluster", "timepoint")) %>%
  mutate(sq_diff = (value - centroid_value)^2)

# Calculate SD around the centroid (not the mean)
summary_stats <- joined %>%
  group_by(cluster, timepoint, centroid_value) %>%
  summarise(
    sd = sqrt(mean(sq_diff)),
    lower = centroid_value - sd,
    upper = centroid_value + sd,
    .groups = "drop"
  )
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.
summary_stats <- summary_stats %>%
    distinct(cluster, timepoint, .keep_all = TRUE)

2.6 Plotting means of each cluster with error bar

summary_stats$timepoint <- factor(summary_stats$timepoint, levels=c("Prevaccine_umi_fraction", "Postvaccine_umi_fraction", "Postnivo_umi_fraction"))
ggplot(summary_stats, aes(x = timepoint, y = centroid_value, group = cluster, color = cluster, fill = cluster)) +
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2, linetype = 0) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(y = "Scaled frequency", x = "Timepoint", title = "Cluster Centroids with SD")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

2.7 Calculating scaled frequency means of each cluster divided by response groups

#faceting this by patient / response group
cluster_assignments <- km_postnivo$cluster

#can generate dataframe with both the scaled and the raw frequencies. 

scaled_mat <- as.data.frame(scaled_umi_matrix_postnivo)
scaled_mat$cluster <- factor(cluster_assignments)
scaled_mat$id <- rownames(scaled_mat)
scaled_mat$patient <- str_extract(scaled_mat$id, "PT\\d+")


# Example patient groups
progressive <- c("PT111", "PT114", "PT117", "PT113")
transient_response <- c("PT106", "PT108", "PT116")
stable_disease <- c("PT118", "PT119", "PT122")

# Add response column
scaled_mat <- scaled_mat %>%
  mutate(response = case_when(
    patient %in% progressive ~ "Progressive",
    patient %in% transient_response ~ "Transient_Response",
    patient %in% stable_disease ~ "Stable_Disease",
    TRUE ~ NA_character_  # For patients not in any group
  ))


# Reshape
long_scaled <- scaled_mat %>%
  pivot_longer(cols = c(Prevaccine_umi_fraction, Postvaccine_umi_fraction, Postnivo_umi_fraction),
               names_to = "timepoint", values_to = "value")

# generating summary stats
summary_stats <- long_scaled %>%
  group_by(cluster,timepoint, response) %>%
  summarise(
    mean_val = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE),
    count = n(),
    lower = mean_val - sd,
    upper = mean_val + sd,
    .groups = "drop"
  )


summary_stats <- summary_stats %>%
    distinct(cluster,timepoint,response, .keep_all = TRUE)

2.8 Plotting the mean frequency within each cluster with its SE

summary_stats$timepoint <- factor(summary_stats$timepoint, levels=c("Prevaccine_umi_fraction", "Postvaccine_umi_fraction", "Postnivo_umi_fraction"))

label_data <- summary_stats[summary_stats$timepoint == "Postnivo_umi_fraction", ]

ggplot(summary_stats, aes(x = timepoint, y = mean_val, group = cluster, color = cluster)) +
  geom_line(size = 1) +
  geom_ribbon(aes(ymin = lower, ymax = upper, fill = cluster), alpha = 0.2, linetype = 0) +
  ggrepel::geom_text_repel(
    data = label_data, aes(label = paste0("n=", count), color = cluster),, 
    size = 3, show.legend = FALSE, 
    direction = "y", box.padding = 0.0005,
    label.padding = 0.0005, nudge_x = 0.2
  ) +
  scale_color_manual(
    values = c(
      "1" = "#E41A1C",  # red
      "2" = "#B3B300",  # olive/yellow
      "3" = "#00B2B2",  # teal
      "4" = "#4DA6FF",  # light blue
      "5" = "#FF99FF"   # pink
    )
  ) +
  scale_fill_manual(
    values = c(
      "1" = "#E41A1C",
      "2" = "#B3B300",
      "3" = "#00B2B2",
      "4" = "#4DA6FF",
      "5" = "#FF99FF"
    )
  ) +
  theme_classic(base_size = 14) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    y = "Raw Frequency", 
    x = "Timepoint", 
    title = "Cluster Mean for Denovo Clones Separated by Response"
  ) + facet_wrap(~response)
Warning in ggrepel::geom_text_repel(data = label_data, aes(label = paste0("n=",
: Ignoring unknown parameters: `label.padding`

sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Rocky Linux 8.10 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /usr/lib64/libopenblasp-r0.3.15.so;  LAPACK version 3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: America/New_York
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] lubridate_1.9.4 forcats_1.0.0   stringr_1.5.1   readr_2.1.5    
 [5] tidyr_1.3.1     tibble_3.2.1    ggplot2_3.5.1   tidyverse_2.0.0
 [9] purrr_1.0.2     dplyr_1.1.4    

loaded via a namespace (and not attached):
 [1] gtable_0.3.5       jsonlite_2.0.0     compiler_4.3.2     Rcpp_1.1.0        
 [5] tidyselect_1.2.1   dichromat_2.0-0.1  scales_1.4.0       fastmap_1.2.0     
 [9] R6_2.6.1           labeling_0.4.3     generics_0.1.3     knitr_1.50        
[13] htmlwidgets_1.6.4  ggrepel_0.9.6      pillar_1.10.1      RColorBrewer_1.1-3
[17] tzdb_0.5.0         rlang_1.1.5        stringi_1.8.7      xfun_0.52         
[21] timechange_0.3.0   cli_3.6.4          withr_3.0.2        magrittr_2.0.3    
[25] digest_0.6.37      grid_4.3.2         rstudioapi_0.17.1  hms_1.1.3         
[29] lifecycle_1.0.4    vctrs_0.6.5        evaluate_1.0.3     glue_1.8.0        
[33] farver_2.1.2       rmarkdown_2.29     tools_4.3.2        pkgconfig_2.0.3   
[37] htmltools_0.5.8.1